{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4b481345",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''\n",
    "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1fc59091",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import ByteLevelBPETokenizer\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d92e953a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/filtered-dumping-academia.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/filtered-dumping-wiki.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-iium.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-news.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-parliament.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-pdf.txt\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/clean/dumping-watpadd.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e23efb39",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = ByteLevelBPETokenizer(lowercase=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2b2b34fa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bahasa-asr-test.txt',\n",
       " 'bahasa-asr-train.txt',\n",
       " 'dumping-iium.txt',\n",
       " 'dumping-news.txt',\n",
       " 'dumping-parliament.txt',\n",
       " 'dumping-pdf.txt',\n",
       " 'dumping-watpadd.txt',\n",
       " 'filtered-dumping-academia.txt',\n",
       " 'filtered-dumping-wiki.txt']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "files = glob('filtered-dumping-*.txt')\n",
    "files.extend(glob('dumping-*.txt'))\n",
    "files = [f for f in files if 'twitter' not in f and 'instagram' not in f]\n",
    "files.extend(['bahasa-asr-train.txt', 'bahasa-asr-test.txt'])\n",
    "files = sorted(files)\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "cc2b6202",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer.train(files=files, vocab_size=32000, min_frequency=2,\n",
    "                show_progress=True,\n",
    "                special_tokens = ['<s>', '<pad>', '</s>', '<|endoftext|>'],)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "bb11c1a2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "mkdir: cannot create directory ‘gpt2_bpe’: File exists\n"
     ]
    }
   ],
   "source": [
    "!mkdir gpt2_bpe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a14bf05e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['gpt2_bpe/vocab.json', 'gpt2_bpe/merges.txt']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.save_model('gpt2_bpe')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fcb6b7a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "import transformers\n",
    "from transformers import GPT2Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "90f475c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('malay-cased-gpt2/tokenizer_config.json',\n",
       " 'malay-cased-gpt2/special_tokens_map.json',\n",
       " 'malay-cased-gpt2/vocab.json',\n",
       " 'malay-cased-gpt2/merges.txt',\n",
       " 'malay-cased-gpt2/added_tokens.json')"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer2 = GPT2Tokenizer.from_pretrained('./gpt2_bpe', do_lower_case = False)\n",
    "tokenizer2.model_max_length = 1024\n",
    "tokenizer2.save_pretrained('malay-cased-gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "de20d583",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreTrainedTokenizer(name_or_path='malay-cased-gpt2', vocab_size=32000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken(\"<|endoftext|>\", rstrip=False, lstrip=False, single_word=False, normalized=True)})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "GPT2Tokenizer.from_pretrained('malay-cased-gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6085995e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[3795]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer2.encode('saya')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1dc96179",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[3]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer2.encode(tokenizer2.bos_token)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "50acb470",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "source": [
    "!head -n 10000 filtered-dumping-wiki.txt > sample-wiki.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "e5a38da9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'bahasa-asr-test.txt bahasa-asr-train.txt dumping-iium.txt dumping-news.txt dumping-parliament.txt dumping-pdf.txt dumping-watpadd.txt filtered-dumping-academia.txt filtered-dumping-wiki.txt'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "' '.join(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9b7b5bd6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "source": [
    "cat {' '.join(files)} >> train-v2.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "06671b51",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "930544970 train-v2.txt\n"
     ]
    }
   ],
   "source": [
    "!wc -w train-v2.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "inilah dunia kecil\n",
      "itu tadi menarik ya\n",
      "pelayan restoran ini kelihatannya sibuk\n",
      "apakah anda pernah makan serangga\n",
      "aku pernah ingin menjadi seorang astronom fisika\n",
      "saya sedang mencari dompet hitam seperti ini\n",
      "apakah anda percaya pada tuhan\n",
      "semuanya sedang menunggu guru di kelas\n",
      "anak-anak kecil suka dengan buku tentang dinosaurus dan monster\n",
      "ini yang dilakukan mary untuk hidup\n"
     ]
    }
   ],
   "source": [
    "!head -n 10 train-v2.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "Ciamannacce ialah komun di jabatan Corse-du-Sud di kepulauan Corsica, Perancis.\n",
      "\n",
      "\n",
      "Coggia ialah komun di jabatan Corse-du-Sud di kepulauan Corsica, Perancis.\n",
      "\n",
      "\n",
      "Cognocoli-Monticchi ialah komun di jabatan Corse-du-Sud di kepulauan Corsica, Perancis.\n",
      "\n",
      "\n",
      "Conca ialah komun di jabatan Corse-du-Sud di kepulauan Corsica, Perancis.\n"
     ]
    }
   ],
   "source": [
    "!tail -n 10 train-v2.txt"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
